July 28 - August 6, 2016
This is another small dataset (~3000 tweets), so the kind of semantic insight obtained with the charisma data is probably not possible for bitcoin - but there may still be other interesting patterns.
For this example no attempt is made to remove spammy or repetitive posts.
In general Twords was designed to look at frequencies and semantics for terms likely to be used in casual conversation (like "charisma") more so than terms that likely have a lot of marketing behind them (like "brexit" or "bitcoin"), but people specifically interested in a term like "bitcoin" may still find interesting patterns here.
In [1]:
import sys
sys.path.append('..')
from twords.twords import Twords
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
# Show the full text of every dataframe cell instead of truncating it, so
# entire tweets are visible when a dataframe is displayed.
# NOTE(review): -1 is the "no limit" value on older pandas; newer releases
# are understood to expect None here - confirm against the installed version.
pd.set_option('display.max_colwidth', -1)
In [2]:
# Create the Twords object and point it at the collected bitcoin tweets and
# at the background word-frequency table used for comparison.
twit = Twords()
twit.data_path = "../data/java_collector/bitcoin/"
twit.background_path = '../jar_files_and_background/freq_table_72319443_total_words_twitter_corpus.csv'
# presumably reads the file at background_path; the path is set first for that reason
twit.create_Background_dict()
# the search term used for this dataset
twit.set_Search_terms(["bitcoin"])
twit.create_Stop_words()
In [3]:
# Load the collected tweets; the later cells read them from twit.tweets_df.
# (presumably reads the csv files under twit.data_path - see Twords for details)
twit.get_java_tweets_from_csv_list()
In [4]:
# Report how many tweets are in the original dataset, before the cleaning
# and filtering cells below run.
# A single formatted string passed to print(...) produces the same output on
# Python 2 and Python 3; the bare print statement is a SyntaxError on 3.
print("Total number of tweets: {}".format(len(twit.tweets_df)))
In [5]:
# Cleaning pipeline: each call is a Twords method applied to the loaded
# tweets. The steps are stateful and run in this order; the notes below are
# read off the method names - see the Twords source for exact behavior.
twit.keep_column_of_original_tweets()
twit.lower_tweets()
twit.keep_only_unicode_tweet_text()
twit.remove_urls_from_tweets()
twit.remove_punctuation_from_tweets()
twit.drop_non_ascii_characters_from_tweets()
# de-duplication happens after the text normalization steps above
twit.drop_duplicate_tweets()
twit.drop_by_search_in_name()
# dates are standardized before the tweets are sorted by date
twit.convert_tweet_dates_to_standard()
twit.sort_tweets_by_date()
In [6]:
# number of rows in twit.tweets_df at this point in the notebook
len(twit.tweets_df)
Out[6]:
In [7]:
# restrict the dataset using the term "bitcoin"
# (presumably keeps only tweets whose text contains it - see Twords)
twit.keep_tweets_with_terms("bitcoin")
In [8]:
# number of rows in twit.tweets_df after the term filter
len(twit.tweets_df)
Out[8]:
In [9]:
# Build the word-frequency table (twit.word_freq_df) used by the cells below.
# The three steps are run in this order; presumably each consumes the result
# of the previous one.
twit.create_word_bag()
twit.make_nltk_object_from_word_bag()
# NOTE(review): 1000 looks like the number of words to keep in the table - confirm in Twords
twit.create_word_freq_df(1000)
In [10]:
# Order the frequency table in place, largest "log relative frequency" first,
# then preview the 20 leading rows.
twit.word_freq_df.sort_values(
    by="log relative frequency",
    ascending=False,
    inplace=True,
)
twit.word_freq_df.head(20)
Out[10]:
In [11]:
# Horizontal bar chart of the words with the highest log relative frequency,
# restricted to rows whose 'background occurrences' exceeds background_cutoff.
num_words_to_plot = 32
background_cutoff = 100

# Filter on the background count, sort ascending so the largest values are
# the last rows (drawn at the top of the chart), then keep the last N rows.
common_words = twit.word_freq_df[twit.word_freq_df['background occurrences'] > background_cutoff]
top_words = (common_words
             .sort_values("log relative frequency", ascending=True)
             .set_index("word")["log relative frequency"]
             .iloc[-num_words_to_plot:])

# Keep the Axes returned by pandas instead of calling plt.axes() afterwards:
# on recent matplotlib releases plt.axes() adds a new, empty Axes on top of
# the chart, so the title/grid must be applied to the plotted Axes directly.
ax = top_words.plot.barh(figsize=(20, num_words_to_plot/2.), fontsize=30, color="c")
ax.set_title("log relative frequency", fontsize=30)
ax.xaxis.grid(linewidth=4);
In [12]:
# Same chart as for the lower cutoff, but requiring more than 500 background
# occurrences so that rarer background words are excluded.
num_words_to_plot = 32
background_cutoff = 500

# Filter on the background count, sort ascending so the largest values are
# the last rows (drawn at the top of the chart), then keep the last N rows.
common_words = twit.word_freq_df[twit.word_freq_df['background occurrences'] > background_cutoff]
top_words = (common_words
             .sort_values("log relative frequency", ascending=True)
             .set_index("word")["log relative frequency"]
             .iloc[-num_words_to_plot:])

# Keep the Axes returned by pandas instead of calling plt.axes() afterwards:
# on recent matplotlib releases plt.axes() adds a new, empty Axes on top of
# the chart, so the title/grid must be applied to the plotted Axes directly.
ax = top_words.plot.barh(figsize=(20, num_words_to_plot/2.), fontsize=30, color="c")
ax.set_title("log relative frequency", fontsize=30)
ax.xaxis.grid(linewidth=4);
In [13]:
# look up tweets for the term "decline" and show at most the first 10 results
twit.tweets_containing("decline")[:10]
Out[13]:
In [14]:
# Horizontal bar chart of the words with the highest log relative frequency,
# this time requiring more than 2000 background occurrences.
num_words_to_plot = 32
background_cutoff = 2000

# Filter on the background count, sort ascending so the largest values are
# the last rows (drawn at the top of the chart), then keep the last N rows.
common_words = twit.word_freq_df[twit.word_freq_df['background occurrences'] > background_cutoff]
top_words = (common_words
             .sort_values("log relative frequency", ascending=True)
             .set_index("word")["log relative frequency"]
             .iloc[-num_words_to_plot:])

# Keep the Axes returned by pandas instead of calling plt.axes() afterwards:
# on recent matplotlib releases plt.axes() adds a new, empty Axes on top of
# the chart, so the title/grid must be applied to the plotted Axes directly.
ax = top_words.plot.barh(figsize=(20, num_words_to_plot/2.), fontsize=30, color="c")
ax.set_title("log relative frequency", fontsize=30)
ax.xaxis.grid(linewidth=4);
In [15]:
# look up tweets for the term "confirmed" and show at most the first 10 results
twit.tweets_containing("confirmed")[:10]
Out[15]:
In [ ]: